import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
# Load the fire-count dataset from the local CSV export and preview the first rows.
csv_path = r'C:\Users\TajwarAbtahee\OneDrive - JCW Resourcing\Desktop\Python\practice\amazon.csv'
df = pd.read_csv(csv_path)
df.head()
| | year | state | month | number | date |
|---|---|---|---|---|---|
| 0 | 1998 | Acre | Janeiro | 0.0 | 1998-01-01 |
| 1 | 1999 | Acre | Janeiro | 0.0 | 1999-01-01 |
| 2 | 2000 | Acre | Janeiro | 0.0 | 2000-01-01 |
| 3 | 2001 | Acre | Janeiro | 0.0 | 2001-01-01 |
| 4 | 2002 | Acre | Janeiro | 0.0 | 2002-01-01 |
# Inspect column dtypes and non-null counts of the freshly loaded frame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6454 entries, 0 to 6453 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 6454 non-null int64 1 state 6454 non-null object 2 month 6454 non-null object 3 number 6454 non-null float64 4 date 6454 non-null object dtypes: float64(1), int64(1), object(3) memory usage: 252.2+ KB
# Count rows per month label to see the raw month names present in the data.
df.month.value_counts()
Janeiro 541 Outubro 540 Julho 540 Novembro 540 Junho 540 Agosto 540 Maio 540 Mar�o 540 Setembro 540 Abril 540 Fevereiro 540 Dezembro 513 Name: month, dtype: int64
# Translate the Portuguese month names to English.
# The lookup is keyed on the first three letters only: they are unique per
# month and plain ASCII, so the translation does not depend on how the
# non-ASCII letter in the Portuguese name for March was decoded when the CSV
# was read (it appears as a replacement character in the loaded data).
# Values with an unknown prefix become NaN, as with a full-name lookup.
month_by_prefix = {
    'Jan': 'January',
    'Fev': 'February',
    'Mar': 'March',
    'Abr': 'April',
    'Mai': 'May',
    'Jun': 'June',
    'Jul': 'July',
    'Ago': 'August',
    'Set': 'September',
    'Out': 'October',
    'Nov': 'November',
    'Dez': 'December',
}
df['month'] = df['month'].str[:3].map(month_by_prefix)
# Re-count to confirm every row received an English month name.
df.month.value_counts()
January 541 August 540 March 540 November 540 April 540 June 540 September 540 July 540 May 540 February 540 October 540 December 513 Name: month, dtype: int64
# Parse the date strings into datetime64 values, then re-check the dtypes.
df['date'] = pd.to_datetime(df['date'])
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6454 entries, 0 to 6453 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 6454 non-null int64 1 state 6454 non-null object 2 month 6454 non-null object 3 number 6454 non-null float64 4 date 6454 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1), int64(1), object(2) memory usage: 252.2+ KB
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()
| | year | number |
|---|---|---|
| count | 6454.000000 | 6454.000000 |
| mean | 2007.461729 | 108.293163 |
| std | 5.746654 | 190.812242 |
| min | 1998.000000 | 0.000000 |
| 25% | 2002.000000 | 3.000000 |
| 50% | 2007.000000 | 24.000000 |
| 75% | 2012.000000 | 113.000000 |
| max | 2017.000000 | 998.000000 |
# Total number of reported fires per year.
# numeric_only=True restricts the sum to numeric columns. pandas 2.0 changed
# the default to numeric_only=False, so without it the text and datetime
# columns would be pulled into the aggregation instead of being dropped.
years = df.groupby('year').sum(numeric_only=True).reset_index()
years.columns
Index(['year', 'number'], dtype='object')
# Static seaborn line chart of the yearly totals, with one x tick per year.
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=years, x='year', y='number', marker='o')
ax.set(xticks=years['year'])
plt.show()
# Interactive plotly version of the same series.
# Observation: a decline for six years from 2003, then a steady rise from 2008 onwards.
px.line(years, x='year', y='number')
# Total number of reported fires per state, largest first.
# numeric_only=True restricts the sum to numeric columns. pandas 2.0 changed
# the default to numeric_only=False, so without it the text and datetime
# columns would be pulled into the aggregation instead of being dropped.
states = (
    df.groupby('state')
    .sum(numeric_only=True)
    .sort_values('number', ascending=False)
    .reset_index()
)
px.bar(states, x='state', y='number', text_auto='.2s')
# Mato Grosso has almost double the number of cases of each other region in the top 5.
# Average number of reported fires per calendar month.
# numeric_only=True restricts the mean to numeric columns. pandas 2.0 changed
# the default to numeric_only=False, so without it the text and datetime
# columns would be pulled into the aggregation instead of being dropped.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# groupby sorts the month names alphabetically; reindexing by name puts them
# back in calendar order without relying on their positions in that sort.
months = (
    df.groupby('month')
    .mean(numeric_only=True)
    .reindex(month_order)
    .reset_index()
)
px.line(months, x='month', y='number')
# February to May can be considered the calmer part of the year; numbers then
# rise drastically during the summer and cool off in winter after November,
# with the exception of a dry spell in September.
# Preview the frame again now that the month names are translated and the dates parsed.
df.head()
| | year | state | month | number | date |
|---|---|---|---|---|---|
| 0 | 1998 | Acre | January | 0.0 | 1998-01-01 |
| 1 | 1999 | Acre | January | 0.0 | 1999-01-01 |
| 2 | 2000 | Acre | January | 0.0 | 2000-01-01 |
| 3 | 2001 | Acre | January | 0.0 | 2001-01-01 |
| 4 | 2002 | Acre | January | 0.0 | 2002-01-01 |
# Total number of reported fires for every (year, state) pair.
# numeric_only=True restricts the sum to numeric columns. pandas 2.0 changed
# the default to numeric_only=False, so without it the text and datetime
# columns would be pulled into the aggregation instead of being dropped.
state_year = df.groupby(['year', 'state']).sum(numeric_only=True).reset_index()
state_year
| | year | state | number |
|---|---|---|---|
| 0 | 1998 | Acre | 730.000 |
| 1 | 1998 | Alagoas | 86.000 |
| 2 | 1998 | Amapa | 278.000 |
| 3 | 1998 | Amazonas | 946.000 |
| 4 | 1998 | Bahia | 1224.687 |
| ... | ... | ... | ... |
| 455 | 2017 | Roraima | 1101.000 |
| 456 | 2017 | Santa Catarina | 2354.000 |
| 457 | 2017 | Sao Paulo | 2540.868 |
| 458 | 2017 | Sergipe | 75.000 |
| 459 | 2017 | Tocantins | 1378.959 |
460 rows × 3 columns
# One line per state showing its yearly fire totals.
px.line(state_year,x='year',y='number',color='state')
# As mentioned previously, Mato Grosso has had a higher case count since 1998 than all other regions.
# Areas such as Sergipe and Alagoas have a consistently low rate throughout the years.
# Let's find the average number of fires per state.
# numeric_only=True restricts the mean to numeric columns. pandas 2.0 changed
# the default to numeric_only=False, so without it the text and datetime
# columns would be pulled into the aggregation instead of being dropped.
case_ave = (
    df.groupby('state')
    .mean(numeric_only=True)
    .sort_values('number', ascending=False)
    .reset_index()
)
px.bar(case_ave, x='state', y='number', text_auto='.3s')
# Although Mato Grosso had the highest total number of cases, Sao Paulo has
# the highest average number of cases per record.